# coding:utf-8

import requests
from lxml import etree


def get_urls():
    """Build the page URLs of the Douban Top 250 movie chart.

    Pages are selected through the ``start`` query parameter in steps of
    25, so a 250-entry chart spans the offsets 0, 25, ..., 225.

    Returns:
        list[str]: The ten page URLs, in ascending offset order.
    """
    base_url = 'https://movie.douban.com/top250?start='
    # Stop before 250: that offset lies past the last chart entry and would
    # only cost an extra request for a page with nothing on it.
    return [base_url + str(offset) for offset in range(0, 250, 25)]


def read_url_text(url):
    """Download a page over HTTP and return its body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        str: The response body as decoded by ``requests``.

    Raises:
        requests.RequestException: If the request fails or times out, or
            the server answers with a 4xx/5xx status.
    """
    # A browser-like User-Agent is sent instead of the requests default.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    # requests waits indefinitely by default; bound the wait so a stalled
    # connection cannot hang the whole run.
    response = requests.get(url, headers=headers, timeout=10)
    # Surface HTTP errors here rather than passing an error page on to the
    # parser, where it would silently yield no entries.
    response.raise_for_status()
    return response.text


def parse_html(text):
    """Extract the movie entries from the HTML of one chart page.

    Args:
        text: HTML source of a chart page.

    Returns:
        list[tuple]: One ``(rank, film_name, pic_path)`` tuple per
        ``<li>`` of the ``grid_view`` list, in document order. Items that
        lack any of the three fields are skipped.
    """
    html = etree.HTML(text)
    # NOTE(review): etree.HTML may yield None when the text contains no
    # parseable markup; treat that the same as a page without entries.
    if html is None:
        return []

    page_data = []
    for ele in html.xpath('//ol[@class="grid_view"]/li'):
        # The leading "." keeps every query inside the current <li>. A bare
        # "//" would search the whole document again for each item.
        rank = ele.xpath('.//div[@class="item"]//div[@class="pic"]//em/text()')
        pic_path = ele.xpath('.//div[@class="item"]//div[@class="pic"]//a/img/@src')
        film_name = ele.xpath('.//div[@class="item"]//div[@class="info"]//div[@class="hd"]/a/span[1]/text()')
        # Skip incomplete items instead of letting one missing field shift
        # or break the entries that follow it.
        if not (rank and pic_path and film_name):
            continue
        page_data.append((rank[0], film_name[0], pic_path[0]))
    return page_data


def main():
    """Scrape every chart page and save the entries to ``db_top250.txt``.

    Pages are fetched one after another in the order given by
    ``get_urls``. Each parsed entry is written on its own line as its
    ``str()`` representation; an existing output file is overwritten.
    """
    result = []
    for url in get_urls():
        result.extend(parse_html(read_url_text(url)))

    # Scraped text may contain non-ASCII characters; pin the encoding so the
    # write does not depend on the platform's default locale.
    with open('db_top250.txt', 'w', encoding='utf-8') as f:
        f.writelines(str(entry) + '\n' for entry in result)


# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
